# Wrangling
library(tidyverse)
library(mgsub)
library(naniar)
# Statistics/Numerical processing
library(brms)
# PCA
library(FactoMineR)
library(factoextra)
library(GPArotation)
library(paran)
# Plotting
library(ggplot2)
library(ghibli)
# Optional settings
options(dplyr.summarise.inform=F) # Stop dplyr from printing summarise error (that isn't an error)
select <- dplyr::select # Ensure that select() command is the dplyr command (clashes with MASS, which is imported/required by paran)
# Standard error function
# Standard error of the mean: sqrt(var / n), ignoring NAs by default.
# @param x     numeric vector
# @param na.rm drop NAs before computing the variance (default TRUE)
# The denominator counts non-missing values (sum(!is.na(x)) is the vector
# equivalent of length(x[complete.cases(x)])), so it matches the n that
# var(..., na.rm = TRUE) uses.
std.error <- function(x, na.rm = TRUE) {
  sqrt(var(x, na.rm = na.rm) / sum(!is.na(x)))
}
# ggplot theme
# Shared ggplot2 theme: black-and-white base with enlarged text for titles,
# axes, facet strips, and legends (theme elements merged into a single call).
gg_theme <- function() {
  theme_bw() +
    theme(
      plot.title       = element_text(size = 25),
      plot.subtitle    = element_text(size = 15, face = "italic"),
      axis.title       = element_text(size = 20),
      axis.text        = element_text(size = 15),
      strip.background = element_rect(fill = "white"),
      strip.text       = element_text(size = 15),
      legend.title     = element_text(size = 15, face = "bold"),
      legend.text      = element_text(size = 15)
    )
}
Read in the data from downloaded CSV files from Firebase. Also remove data from subjects who did not complete the study (“returned” on Prolific) or have been identified to be outliers, not following instructions, not fulfilling my participant requirements, etc. Outliers are identified in a later section below (Outlier Check), while participant requirements are checked in the data from the questionnaire.
# List results files per subject
# Anchor the pattern: "." is a regex wildcard, so pattern=".csv" would also
# match names like "datacsv"; "\\.csv$" matches only a literal .csv suffix.
filelist <- list.files(path="./data/axb_1b/perception/", pattern="\\.csv$", full.names=TRUE)
# Check Number of files
paste("Total number of perception results files in directory:",length(filelist))
[1] "Total number of perception results files in directory: 101"
paste("Expected number of questionnaire files:",length(filelist)-2-2) # -2 (repeats) -2 (returns)
[1] "Expected number of questionnaire files: 97"
In this case, when reading in the data by participant file, the experiment start time is extracted from the structure ./data/axb_XX/perception/subjnumber_starttime.csv where I first remove the “.csv” with gsub, split the string into three based on "_", then select the third item from the first output object ([[1]][3]).
# Read and Concatenate results
## (1) Just read and concat
# condata.read <- do.call(rbind, lapply(filelist, read.csv))
## (2) Read, concat and extract part of filename
## starttime comes from "./data/axb_XX/perception/subjnumber_starttime.csv":
## strip the ".csv" suffix (anchored sub(), since "." is a regex wildcard and
## an unanchored gsub(".csv", ...) could also remove e.g. "Xcsv" mid-path),
## split on "_", and take the third piece.
condata.read <- do.call(rbind, lapply(filelist, function(x) cbind(read.csv(x),
                        starttime=strsplit(sub("\\.csv$","",x), "_")[[1]][3])))
## TODO: Update with subject numbers as necessary
# Subjects dropped from the analysis (reasons noted per ID).
# '5fc511c32b9bc60bc189b893' (the "alien" respondent) was KEPT in the end.
dropped.ids <- c(
  '5a68d1c1c0d83600010821e0', # did not finish -- RETURNED
  '5fcd0ee406bd7ab94ecc424c', # spokane + 50% too fast -- RETURNED
  '5e67f0321e4f0a0a657c1d08', # not michigan - florida to 22
  '5f9f96c7ea7f965f92f5d488', # not michigan - colorado to 18
  '5ffa01eb7dfb43387a868196', # not michigan - ny to 28
  '5fee8cf38ac18214460c59ed', # not michigan - pa to 15
  '5699d25625d9e9000db0c7cc', # 168 + 85 guy... did twice(?) and skipped all
  '601f04fb235cb14020c7a96f', # did twice, once with condA then condC, so can't use even full data
  '5fe0e7892738afa6a21cfd7c'  # did not finish study/questionnaire -- RETURNED
)
condata.read <- condata.read %>%
  # Fix data with undefined subject
  mutate(participantId = replace_na(participantId, '5d1e2045a37a4d001a1fc2cb')) %>%
  # Remove data from dropped subjects (one filter instead of repeated subset calls;
  # safe because replace_na() above guarantees no NA participantId remains)
  filter(!participantId %in% dropped.ids) %>%
  # Remove levels of dropped subjects
  droplevels()
# Trim unnecessary columns (browser/jsPsych bookkeeping fields)
condata <- condata.read %>%
  select(-c(url, internal_node_id, view_history, stimulus, success, key_press,
            trial_index, trial_type, wordStim))
# Keep only tonetest and test rows
# Create subject number column by order of participation (first sorting subjects by starttime, then adding subjnum column)
# Recover vowel data from sentNum column (== Vowel data was accidently left out of stimuli data)
# Manipulate data types
# Reorder columns
condata <- condata %>%
  # %in% replaces chained == comparisons
  filter(trial_role %in% c("test", "tonetest")) %>%
  #arrange(desc(starttime)) %>%
  #mutate(subjNum = as.factor(rep(1:nSubj, each=(nrow(.)/nSubj)))) %>%
  # sentNum 11-20 are /au/ items, 21-30 are /ai/ items
  mutate(vowel = case_when(between(sentNum, 11, 20) ~ "AU",
                           between(sentNum, 21, 30) ~ "AI")) %>%
  # Map condition codes onto the design factors
  mutate(guiseCombination = case_when(conditionId %in% c("condA", "condB") ~ "match",
                                      conditionId %in% c("condC", "condD") ~ "mismatch",
                                      conditionId %in% c("condE", "condF") ~ "baseline")) %>%
  mutate(speakerOrder = case_when(conditionId %in% c("condA", "condC", "condE") ~ "S3-S9",
                                  conditionId %in% c("condB", "condD", "condF") ~ "S9-S3")) %>%
  mutate(rt = as.numeric(rt)) %>%
  # across() replaces the superseded mutate_if()
  mutate(across(where(is.character), as.factor)) %>%
  mutate(time_elapsed_sec = time_elapsed/1000, rt_sec = rt/1000) %>%
  select(participantId, conditionId, trial_role, time_elapsed, time_elapsed_sec, rt, rt_sec, everything())
# Check data
condata
#summary(condata)
# Check number of data points per subject
# Correct number of data points is 174/180/186 = (168 + 6/12/18)
# NOTE: count() on a grouped tibble tallies within the existing groups and
# keeps the grouping, so nData.bysubj has one row per subject data file.
(nData.bysubj <- condata %>%
group_by(conditionId, participantId, starttime) %>%
count())
# Check number of data points per condition (goal: 40 per condition)
# Each row of nData.bysubj is one subject file, so counting rows per
# conditionId gives the number of subjects per condition.
(nSubj.bycond <- nData.bysubj %>%
group_by(conditionId) %>%
count())
Troubleshoot participant issues manually and/or by filtering duplicate subject files or those with too few responses.
# Manually scroll to check data as needed
View(nData.bysubj)
# Troubleshoot duplicated participants
## Calculate number of subjects vs data files
paste("Total number of screened perception data files:",nrow(nData.bysubj)) # number of data files
[1] "Total number of screened perception data files: 90"
paste("Total number of unique participant numbers:",length(unique(condata.read$participantId))) # unique subj nums
[1] "Total number of unique participant numbers: 90"
## Identify dups
## duplicated() flags the second and later occurrences of a participantId;
## use <- (not =) for top-level assignment, and filter on the logical directly
nData.bysubj$dups <- duplicated(nData.bysubj$participantId)
nData.bysubj %>% filter(dups)
# Troubleshoot subjects with too few responses (a full file has >= 174 rows)
filter(nData.bysubj, n < 174)
NA
# Select "tonetest" data and score the headphone/attention check.
# For each subject, keep the final 6 tone trials (largest time_elapsed) and
# count how many were answered correctly; subjects at 4/6 or below are
# inspected below. (The two original pipelines are merged into one chain.)
condata.tones <- condata %>%
  subset(trial_role == "tonetest") %>%
  mutate(correct_response = tolower(correct_response)) %>%
  group_by(participantId, conditionId) %>%
  slice_max(order_by = time_elapsed, n = 6) %>% # last 6 tone trials
  summarise(tonesCorrect = sum(correct_response == "true")) %>%
  ungroup()
condata.tones
# List participants scoring below 5/6 on the tone (headphone/attention) check
condata.tones %>% filter(tonesCorrect<5)
# Load stim.durations.final dataframe (stimulus audio durations, used below
# for the RT outlier bounds) -- assumes the .rData file defines the object
# `stim.durations.final`; verify if the file is regenerated
load(file="./data/stim_durations.rData")
# Select "Test" data + remove columns for Tonetest data
# Merge tonesCorrect column in for later decisions (e.g. if remove people with low score in tone test)
condata.test <- condata %>%
  subset(trial_role == "test") %>%
  select(-c(button_pressed, correct_answer, correct_response)) %>%
  droplevels() %>%
  # merge() is kept (rather than left_join) to preserve its sort-by-key output order
  merge(condata.tones, all.x = TRUE) %>%
  merge(stim.durations.final, all.x = TRUE) %>%
  select(participantId, conditionId, trial_role, time_elapsed, time_elapsed_sec,
         rt, rt_sec, raised_response, everything())
condata.test
# Check data for obvious issues
summary(condata.test)
participantId conditionId trial_role time_elapsed time_elapsed_sec rt rt_sec
56d9f54b610fc0000bb76e8d: 168 condA:5376 test:15120 Min. : 73779 Min. : 73.78 Min. : 128 Min. : 0.128
56e72067c89073000be77fdf: 168 condC:4536 1st Qu.: 499867 1st Qu.: 499.87 1st Qu.: 3171 1st Qu.: 3.171
59c5689f46f72100019066a7: 168 condE:5208 Median : 751400 Median : 751.40 Median : 3520 Median : 3.520
5a412f0b99311d0001df431e: 168 Mean : 864770 Mean : 864.77 Mean : 3743 Mean : 3.743
5b14791ebd9c31000156e8ab: 168 3rd Qu.:1005121 3rd Qu.:1005.12 3rd Qu.: 3965 3rd Qu.: 3.965
5bef64b373e44f0001092811: 168 Max. :4908639 Max. :4908.64 Max. :44512 Max. :44.512
(Other) :14112
raised_response vowel speaker sentNum order step speakerIdentity speakerGuise guiseName raised_answer
false:7296 AI:7560 S3:15120 Min. :18.00 axb:7560 Min. :2 MI:15120 BL:5208 S3-BL:5208 Min. :0.0
true :7824 AU:7560 1st Qu.:19.00 bxa:7560 1st Qu.:3 CN:4536 S3-CN:4536 1st Qu.:0.0
Median :20.50 Median :5 MI:5376 S3-MI:5376 Median :0.5
Mean :21.67 Mean :5 Mean :0.5
3rd Qu.:22.00 3rd Qu.:7 3rd Qu.:1.0
Max. :30.00 Max. :8 Max. :1.0
key_response starttime guiseCombination speakerOrder tonesCorrect dur_sec dur_ms quart_dur_sec
Min. :0.0000 1612319713122: 168 baseline:5208 S3-S9:15120 Min. :0.000 Min. :1.799 Min. :1799 Min. :1.349
1st Qu.:0.0000 1612320743251: 168 match :5376 1st Qu.:5.000 1st Qu.:2.176 1st Qu.:2176 1st Qu.:1.632
Median :1.0000 1612320747221: 168 mismatch:4536 Median :6.000 Median :2.406 Median :2406 Median :1.804
Mean :0.5964 1612322477195: 168 Mean :5.522 Mean :2.415 Mean :2415 Mean :1.811
3rd Qu.:1.0000 1612323119149: 168 3rd Qu.:6.000 3rd Qu.:2.682 3rd Qu.:2682 3rd Qu.:2.011
Max. :1.0000 1612323133046: 168 Max. :6.000 Max. :3.022 Max. :3022 Max. :2.266
(Other) :14112
quart_dur_ms
Min. :1349
1st Qu.:1632
Median :1804
Mean :1811
3rd Qu.:2011
Max. :2266
# Check original data points
(datapoints.og <- nrow(condata.test))
[1] 15120
# What are the descriptive stats on total experiment time and reaction times?
# One row per subject: total time (in minutes) plus RT summaries (in seconds).
condata.test.bysubj <- condata.test %>%
  group_by(participantId, conditionId) %>%
  summarize(time_elapsed_min = max(time_elapsed_sec / 60),
            mean_rt_sec = mean(rt_sec),
            min_rt_sec = min(rt_sec),
            max_rt_sec = max(rt_sec),
            sd_rt_sec = sd(rt_sec)) %>%
  ungroup()
head(condata.test.bysubj)
# Summary of total experiment times
# Check for especially short or long times
condata.test.overall <- condata.test.bysubj %>%
  summarize(median_time = median(time_elapsed_min),
            mean_time = mean(time_elapsed_min),
            min_time = min(time_elapsed_min),
            max_time = max(time_elapsed_min))
condata.test.overall
# RT Outlier check
# Overall RT summaries plus candidate cutoffs: 3*SD, 3*IQR, and
# mean stimulus duration + 10 s
condata.test.timesum <- condata.test %>%
  summarize(meanTime = mean(rt), sdTime = sd(rt),
            minTime = min(rt), maxTime = max(rt),
            medianTime = median(rt), iqrTime = IQR(rt),
            meanStimTime = mean(dur_ms)) %>%
  mutate(sd3 = sdTime * 3, iqr3 = iqrTime * 3, stimTime10 = meanStimTime + 10000)
condata.test.timesum
# Add columns of outlier criteria:
#  - lower: responded before 3/4 of the stimulus had played (quart_dur_ms)
#  - upper: responded more than 10 s after the stimulus ended
condata.test.check <- condata.test %>%
  mutate(rt.outlier.lower = rt < quart_dur_ms,
         rt.outlier.upper = rt > (dur_ms + 10000))
# Check list of outliers that were removed (either flag TRUE)
condata.test.outliers <- condata.test.check %>%
  filter(rt.outlier.lower | rt.outlier.upper)
summary(condata.test.outliers)
participantId conditionId trial_role time_elapsed time_elapsed_sec rt rt_sec raised_response
5e42f74f5b772a18434cabf7:14 condA:13 test:74 Min. : 183267 Min. : 183.3 Min. : 128.0 Min. : 0.1280 false:39
5fc511c32b9bc60bc189b893: 7 condC:26 1st Qu.: 568489 1st Qu.: 568.5 1st Qu.: 868.5 1st Qu.: 0.8685 true :35
5fc769df1f4e27017a638e8e: 5 condE:35 Median : 845758 Median : 845.8 Median : 7299.0 Median : 7.2990
5fdf9d13a6a9ed7d8efd0b69: 4 Mean : 880377 Mean : 880.4 Mean :10097.6 Mean :10.0976
60053227125e504142df91e9: 4 3rd Qu.:1052409 3rd Qu.:1052.4 3rd Qu.:16339.5 3rd Qu.:16.3395
5dd27daedba63428af7caf09: 3 Max. :2055174 Max. :2055.2 Max. :44512.0 Max. :44.5120
(Other) :37
vowel speaker sentNum order step speakerIdentity speakerGuise guiseName raised_answer key_response
AI:36 S3:74 Min. :18.00 axb:39 Min. :2.000 MI:74 BL:35 S3-BL:35 Min. :0.000 Min. :0.0000
AU:38 1st Qu.:19.00 bxa:35 1st Qu.:4.000 CN:26 S3-CN:26 1st Qu.:0.000 1st Qu.:0.0000
Median :20.00 Median :5.000 MI:13 S3-MI:13 Median :0.000 Median :1.0000
Mean :21.89 Mean :4.919 Mean :0.473 Mean :0.5135
3rd Qu.:22.00 3rd Qu.:6.750 3rd Qu.:1.000 3rd Qu.:1.0000
Max. :30.00 Max. :8.000 Max. :1.000 Max. :1.0000
starttime guiseCombination speakerOrder tonesCorrect dur_sec dur_ms quart_dur_sec quart_dur_ms
1612323462402:14 baseline:35 S3-S9:74 Min. :0.000 Min. :1.799 Min. :1799 Min. :1.349 Min. :1349
1612320747221: 7 match :13 1st Qu.:5.000 1st Qu.:2.176 1st Qu.:2176 1st Qu.:1.632 1st Qu.:1632
1612406509940: 5 mismatch:26 Median :6.000 Median :2.540 Median :2540 Median :1.905 Median :1905
1612323119149: 4 Mean :5.581 Mean :2.432 Mean :2432 Mean :1.824 Mean :1824
1612366388975: 4 3rd Qu.:6.000 3rd Qu.:2.682 3rd Qu.:2682 3rd Qu.:2.011 3rd Qu.:2011
1612372452274: 3 Max. :6.000 Max. :3.022 Max. :3022 Max. :2.266 Max. :2266
(Other) :37
rt.outlier.lower rt.outlier.upper
Mode :logical Mode :logical
FALSE:37 FALSE:37
TRUE :37 TRUE :37
# Summarize number of outliers attributed to each participant
# (count() with no extra vars on a grouped tibble tallies within groups;
# sort=TRUE puts the participants with the most outlier trials first)
condata.outliers.bysubj <- condata.test.outliers %>% #filter(conditionId=="condC") %>%
group_by(participantId, conditionId) %>%
count(sort=TRUE)
condata.outliers.bysubj
View(condata.test.outliers)
# A
4/168 #0.02380952 subj 60053227125e504142df91e9
[1] 0.02380952
# C
# 85/168 #0.5059524 subj 5699d25625d9e9000db0c7cc **BUT ALSO IN COND E???
14/168 #0.08333333 subj 5e42f74f5b772a18434cabf7 --- all were longer; can keep if necessary
[1] 0.08333333
4/168 #0.02380952 subj 5a68d1c1c0d83600010821e0
[1] 0.02380952
# E
# 168/168 #0.5059524 subj 5699d25625d9e9000db0c7cc **BUT ALSO IN COND C???
# 85/168 #0.5059524 5fcd0ee406bd7ab94ecc424c --- the spokane 18-year-old
7/168 #0.04166667 subj 5fc511c32b9bc60bc189b893 --- all were shorter...; also the alien guy
[1] 0.04166667
5/168 #0.0297619 5fc769df1f4e27017a638e8e
[1] 0.0297619
4/168 #0.02380952 subj 5fdf9d13a6a9ed7d8efd0b69
[1] 0.02380952
Go back to top and remove outlier participants, if necessary. Then rerun everything up to this point.
# Remove outliers
# Filter directly on the outlier flags rather than setdiff(): dplyr's setdiff()
# also drops duplicate rows as a silent side effect. `%in% TRUE` treats an NA
# flag as "not an outlier", matching the subset()-based outlier selection above
# (subset drops NA-condition rows from the outlier set, so they stay here).
condata.test.final <- condata.test.check %>%
  filter(!((rt.outlier.lower | rt.outlier.upper) %in% TRUE))
# Group data by StimType (i.e. Speaker-SpeakerGuise-Vowel-SentNum)
# Token -> word lookup table (replaces a 12-arm case_when); tokens not in the
# table get NA, exactly as an unmatched case_when would.
token.words <- c(S3_21 = "bright",    S3_22 = "device",  S3_30 = "twice",
                 S9_23 = "goodnight", S9_25 = "invite",  S9_29 = "sight",
                 S3_18 = "slouch",    S3_19 = "without", S3_20 = "workout",
                 S9_11 = "checkout",  S9_18 = "sprouts", S9_20 = "workout")
condata.test.final <- condata.test.final %>%
  unite(token, speaker, sentNum, remove = FALSE) %>%
  mutate(word = unname(token.words[token])) %>%
  mutate(item = paste0(speaker, "_", word)) %>%
  # Numeric coding of the response: "true" -> 1, "false" -> 0
  mutate(respRS = case_when(raised_response == "true" ~ 1,
                            raised_response == "false" ~ 0)) %>%
  unite(stimType_byword, speaker, speakerGuise, vowel, word, remove = FALSE) %>%
  unite(stimType_byvowel, speaker, speakerGuise, vowel, remove = FALSE) %>%
  mutate(sentNum = as.factor(sentNum)) %>%
  # Center the continuum step around 5 (observed steps 2-8 become -3..3)
  mutate(step = step - 5) %>%
  select(participantId, guiseCombination, step, vowel, speakerGuise, speaker,
         word, speakerOrder, respRS, stimType_byword, stimType_byvowel, everything())
# Summary
summary(condata.test.final)
participantId guiseCombination step vowel speakerGuise speaker word speakerOrder
56d9f54b610fc0000bb76e8d: 168 baseline:5173 Min. :-3.0000000 AI:7524 BL:5173 S3:15046 Length:15046 S3-S9:15046
59c5689f46f72100019066a7: 168 match :5363 1st Qu.:-2.0000000 AU:7522 CN:4510 Class :character
5a412f0b99311d0001df431e: 168 mismatch:4510 Median : 0.0000000 MI:5363 Mode :character
5b14791ebd9c31000156e8ab: 168 Mean : 0.0003988
5bf49e46cade8c0001e8387f: 168 3rd Qu.: 2.0000000
5c1c181289f03500017281f9: 168 Max. : 3.0000000
(Other) :14038
respRS stimType_byword stimType_byvowel conditionId trial_role time_elapsed time_elapsed_sec rt
Min. :0.0000 Length:15046 Length:15046 condA:5363 test:15046 Min. : 73779 Min. : 73.78 Min. : 1704
1st Qu.:0.0000 Class :character Class :character condC:4510 1st Qu.: 499486 1st Qu.: 499.49 1st Qu.: 3171
Median :1.0000 Mode :character Mode :character condE:5173 Median : 751208 Median : 751.21 Median : 3520
Mean :0.5177 Mean : 864693 Mean : 864.69 Mean : 3711
3rd Qu.:1.0000 3rd Qu.:1004309 3rd Qu.:1004.31 3rd Qu.: 3960
Max. :1.0000 Max. :4908639 Max. :4908.64 Max. :12600
rt_sec raised_response token sentNum order speakerIdentity guiseName raised_answer key_response
Min. : 1.704 false:7257 Length:15046 18:2509 axb:7521 MI:15046 S3-BL:5173 Min. :0.0000 Min. :0.0000
1st Qu.: 3.171 true :7789 Class :character 19:2507 bxa:7525 S3-CN:4510 1st Qu.:0.0000 1st Qu.:0.0000
Median : 3.520 Mode :character 20:2506 S3-MI:5363 Median :1.0000 Median :1.0000
Mean : 3.711 21:2511 Mean :0.5001 Mean :0.5968
3rd Qu.: 3.960 22:2507 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :12.600 30:2506 Max. :1.0000 Max. :1.0000
starttime tonesCorrect dur_sec dur_ms quart_dur_sec quart_dur_ms rt.outlier.lower rt.outlier.upper
1612320743251: 168 Min. :0.000 Min. :1.799 Min. :1799 Min. :1.349 Min. :1349 Mode :logical Mode :logical
1612322477195: 168 1st Qu.:5.000 1st Qu.:2.176 1st Qu.:2176 1st Qu.:1.632 1st Qu.:1632 FALSE:15046 FALSE:15046
1612323133046: 168 Median :6.000 Median :2.272 Median :2272 Median :1.704 Median :1704
1612323295513: 168 Mean :5.522 Mean :2.415 Mean :2415 Mean :1.811 Mean :1811
1612323671492: 168 3rd Qu.:6.000 3rd Qu.:2.682 3rd Qu.:2682 3rd Qu.:2.011 3rd Qu.:2011
1612366088661: 168 Max. :6.000 Max. :3.022 Max. :3022 Max. :2.266 Max. :2266
(Other) :14038
item
Length:15046
Class :character
Mode :character
# Print data
condata.test.final
# Write to file
# row.names = FALSE (spelled out: T/F are reassignable, TRUE/FALSE are not)
write.csv(condata.test.final, 'data/axb_1b_exp_data.csv', row.names=FALSE)
# Final kept data points
(datapoints.final <- nrow(condata.test.final))
[1] 15046
# Final removed data points
datapoints.og-datapoints.final
[1] 74
# Calculate percentage of data removed
(datapoints.og-datapoints.final)/datapoints.og # = 0.008852259 = 0.8% of the data were removed due to responses that were too quick (less than 3/4 of the time into the audio file, before the third token would have played) or too slow (over 10 sec after the end of the audio file, an arbitrarily chosen value that should be enough time if a participant were responding as quickly as possible)
[1] 0.00489418
# Per-subject mean proportion of 'raised' responses per design cell
subj.means <- condata.test.final %>% #filter(participantId!='5e42f74f5b772a18434cabf7') %>%
  group_by(participantId, step, vowel, speakerGuise) %>%
  summarise(mean.Prop = mean(respRS))
# Group means and SEs per cell (averaging over the subject means)
condition.means <- subj.means %>%
  group_by(step, vowel, speakerGuise) %>%
  summarise(grandM.Prop = mean(mean.Prop),
            se = std.error(mean.Prop))
# Lineplot of proportion 'raised' by continuum step, one panel per vowel,
# colored/linetyped by guise; error bars = +/- 1 SE of subject means.
guise.cols <- ghibli_palette("PonyoMedium")[c(6, 4, 3)]
byGuise_prop_plot <- condition.means %>%
  ggplot(aes(x = step, y = grandM.Prop)) +
  geom_point(aes(colour = factor(speakerGuise)), cex = 5) +
  geom_line(aes(colour = factor(speakerGuise), linetype = factor(speakerGuise)), lwd = 1) +
  geom_errorbar(aes(ymin = grandM.Prop - se, ymax = grandM.Prop + se,
                    colour = factor(speakerGuise)),
                width = .25) +
  facet_grid(~vowel) +
  labs(y = "Proportion 'raised' response", x = "Continuum Step (UR to RS)",
       color = "Guise", linetype = "Guise",
       title = "Raising Perception: By Guise") +
  coord_cartesian(ylim = c(0, 1)) +
  scale_x_continuous(breaks = -3:3) +
  scale_color_manual(values = guise.cols) +
  gg_theme()
byGuise_prop_plot
# Per-subject mean log RT per design cell (S3 trials only)
subj.means <- condata.test.final %>% filter(speaker=='S3') %>% #filter(participantId!='5e42f74f5b772a18434cabf7') %>%
  group_by(participantId, step, vowel, speakerGuise) %>%
  summarise(mean.rt = mean(log(rt)))
# Group means and SEs per cell (averaging over the subject means)
condition.means <- subj.means %>%
  group_by(step, vowel, speakerGuise) %>%
  summarise(grandM.rt = mean(mean.rt),
            se = std.error(mean.rt))
# Lineplot of log RT by guise; error bars = +/- 1 SE of subject means
byGuise_rt_plot <- condition.means %>%
  ggplot(aes(x = step, y = grandM.rt)) +
  geom_point(aes(colour = factor(speakerGuise)), cex = 5) +
  geom_line(aes(colour = factor(speakerGuise), linetype = factor(speakerGuise)), lwd = 1) +
  geom_errorbar(aes(ymin = grandM.rt - se, ymax = grandM.rt + se,
                    colour = factor(speakerGuise)),
                width = .25) +
  facet_grid(~vowel) +
  labs(y = "Log Response Time", x = "Continuum Step (UR to RS)",
       color = "Guise", linetype = "Guise",
       title = "Reaction Time: By Guise") +
  scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 4, 3)]) +
  gg_theme()
byGuise_rt_plot
# Per-subject mean proportion 'raised' per word (S3 trials only)
subj.means <- condata.test.final %>% filter(speaker=="S3") %>%
  group_by(participantId, step, vowel, speakerGuise, speaker, word) %>%
  summarise(mean.Prop = mean(respRS))
# Group means and SEs per word cell (averaging over subject means)
condition.means <- subj.means %>%
  group_by(step, vowel, speakerGuise, speaker, word) %>%
  summarise(grandM.Prop = mean(mean.Prop),
            se = std.error(mean.Prop))
# The AI and AU plots share the same layout, so build both with one helper
plot_by_word <- function(vowel.sel, plot.title) {
  condition.means %>% filter(vowel == vowel.sel) %>%
    ggplot(aes(x = step, y = grandM.Prop)) +
    geom_point(aes(colour = factor(speakerGuise)), cex = 5, alpha = 0.75) +
    geom_line(aes(colour = factor(speakerGuise), linetype = factor(word)), lwd = 1) +
    geom_errorbar(aes(ymin = grandM.Prop - se, ymax = grandM.Prop + se,
                      colour = factor(speakerGuise)),
                  width = .25) +
    facet_grid(speaker ~ word) +
    labs(y = "Proportion 'raised' response", x = "Continuum Step (UR to RS)",
         color = "Guise", linetype = "Word", title = plot.title) +
    coord_cartesian(ylim = c(0, 1)) +
    scale_x_continuous(breaks = -3:3) +
    scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 4, 3)]) +
    gg_theme()
}
# AI
byWord_prop_plot <- plot_by_word("AI", "AI Raising Perception: By Word")
byWord_prop_plot
# AU
byWord_prop_plot <- plot_by_word("AU", "AU Raising Perception: By Word")
byWord_prop_plot
(adapted from CantoMergers project)
The Qualtrics output includes a text response file and a numerical response file. Because I want to use text for some questions (e.g. IK2, the word selection question) but numbers for other questions (e.g. familiarity scale questions), I need to work with both.
I have downloaded the files and renamed them as ‘_text’ and ‘_num’ files. Specifically, file names have been renamed from the original Qualtrics download name to final_lbq_num.csv and final_lbq_text.csv.
length(unique(quesdata.read$subjID))
[1] 136
Remove the unnecessary columns and rows. Also extract the question text while we’re at it (before removing the rows) so that we can refer to the questions as necessary, but they won’t be in the data to be analysed.
# Remove metadata columns (Qualtrics bookkeeping fields, the first several columns)
quesdata <- quesdata.read %>%
  select(-c(StartDate, EndDate, Status, IPAddress, Progress, Duration..in.seconds.,
            Finished, RecordedDate, ResponseId, RecipientLastName, RecipientFirstName,
            RecipientEmail, ExternalReference, LocationLatitude, LocationLongitude,
            DistributionChannel, UserLanguage, PROLIFIC_PID))
# Question reference (if want to look back at question text)
questions <- quesdata %>% slice(1)
# Remove unnecessary question-header and test-data rows + add/fix relevant info + remove data from dropped subjects
quesdata <- quesdata %>%
  # Remove irrelevant rows (instruction text, import-ID row, preview run)
  subset(subjID != "Please check that your Prolific ID is correct, then press the 'next' button to continue with the survey." &
           subjID != "{\"ImportId\":\"QID78_TEXT\"}" &
           subjID != "preview1") %>%
  # Merge with perception data (for convenience, uses the minimal dataframe `condata.tones`)
  ## Adds subject and condition info + automatically drops subjects that were screened out based on perception experiment performance/returns
  rename(participantId = subjID) %>%
  merge(condata.tones) %>%
  # Same condition -> design-factor mapping as the perception data
  mutate(guiseCombination = case_when(conditionId %in% c("condA", "condB") ~ "match",
                                      conditionId %in% c("condC", "condD") ~ "mismatch",
                                      conditionId %in% c("condE", "condF") ~ "baseline")) %>%
  mutate(speakerOrder = case_when(conditionId %in% c("condA", "condC", "condE") ~ "S3-S9",
                                  conditionId %in% c("condB", "condD", "condF") ~ "S9-S3")) %>%
  droplevels()
# Fix answers from the one 'alien' participant
# (manual corrections to their demographics and location history)
quesdata <- quesdata %>%
  mutate(Gender = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'm', Gender),
         LingExp = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'none', LingExp),
         Loc1_1 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '0', Loc1_1),
         Loc1_2 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '1', Loc1_2),
         Loc1_3 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'zeeland', Loc1_3),
         Loc1_4 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'ottawa', Loc1_4),
         Loc1_5 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'mi', Loc1_5),
         Loc2_1 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '1', Loc2_1),
         Loc2_2 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '2', Loc2_2),
         Loc2_3 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'sedona', Loc2_3),
         Loc2_4 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '', Loc2_4),
         Loc2_5 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'arizona', Loc2_5),
         Loc3_1 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '2', Loc3_1),
         Loc3_2 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '14', Loc3_2),
         Loc3_3 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'glenn', Loc3_3),
         Loc3_4 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'allegan', Loc3_4),
         Loc3_5 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'mi', Loc3_5),
         Loc4_1 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '14', Loc4_1),
         Loc4_2 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '18', Loc4_2),
         Loc4_3 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'south haven', Loc4_3),
         Loc4_4 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'van buren', Loc4_4),
         # BUG FIX: this line previously read `Loc5_5 = ifelse(..., 'mi', Loc4_5)`,
         # which overwrote EVERY participant's Loc5_5 with their Loc4_5 value
         # (mutate evaluates sequentially, so the later Loc5_5 line then used the
         # clobbered column as its fallback). It was clearly meant to set Loc4_5.
         Loc4_5 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'mi', Loc4_5),
         Loc5_1 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '18', Loc5_1),
         Loc5_2 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '23', Loc5_2),
         Loc5_3 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'grand rapids', Loc5_3),
         Loc5_4 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'kent', Loc5_4),
         Loc5_5 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'mi', Loc5_5),
         Loc6_1 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '23', Loc6_1),
         Loc6_2 = ifelse(participantId=='5fc511c32b9bc60bc189b893', '34', Loc6_2),
         Loc6_3 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'various... grandville', Loc6_3),
         Loc6_4 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'various... kent', Loc6_4),
         Loc6_5 = ifelse(participantId=='5fc511c32b9bc60bc189b893', 'various... mi', Loc6_5))
Here’s the table of the question tags, numbers and text. This interactive table allows for sorting and searching! We can use this to check the exact wording of the questions—all of them as they were shown to the participant.
# Print questions for reference
DT::datatable(questions,
options=list(scrollX = TRUE,
autoWidth = TRUE,
columnDefs = list(list(width = '200px', targets = "_all"))))
# Subset to demographic question columns
quesdata.demo <- quesdata %>% filter(dataFormat == "text") %>%
  select(conditionId, participantId, guiseCombination, speakerOrder,
         Age, Gender, Ethnicity, SpHDisorder, SpHDisorder_2_TEXT, Degree, LingExp, FirstLang,
         Loc1_1:Loc6_5) %>%
  # Fix spelling errors and variation on demographic questions
  # (across() replaces the superseded mutate_at()/mutate_if())
  mutate(across(-c(conditionId, speakerOrder), tolower)) %>%
  mutate(across(where(is.character), str_trim))
# Check data
quesdata.demo
# Summary
# summary(quesdata.demo)
# Quick check via table
ques.demo.sum <- quesdata.demo %>% select(conditionId, Age, Gender, Ethnicity, Loc1_5)
with(quesdata.demo, unique(Gender))
[1] "male" "female" "m" "non binary" "woman/female" "cis male" "nonbinary"
with(quesdata.demo, unique(Loc1_5))
[1] "michigan" "mi" "ca" "usa" "michigan/usa" "michigan/us" "mi/" "mi, us"
[9] "" "mi/usa" "ca/usa" "texas" "michigan usa" "michigan, usa"
with(quesdata.demo, unique(Ethnicity))
[1] "white" "asian" "caucasian" "white/hispanic" "african american" "white/caucasian" "american"
[8] "caucasian/white" "black/usa" "caucasion" "hispanic" "white american" "african-american" "middle-eastern"
[15] "sicilian/american" "middle easten" "black" "asian/caucasian" "mezzogiorno" "east asian"
# Normalize Gender, location-state, and Ethnicity labels
quesdata.demo <- quesdata.demo %>%
  mutate(Gender = mgsub(Gender, c("woman.*|female|cis female", "male|cis male", "non-binary|non binary|nonbinary"), c("f", "m", "nb"))) %>%
  # across() replaces the superseded mutate_at(vars(...))
  mutate(across(starts_with("Loc") & ends_with("_5"), ~ mgsub(.x, c("michigan.*", "mi.*"), c("mi", "mi")))) %>%
  mutate(Ethnicity = mgsub(Ethnicity, c("caucasian|caucasion|american|mezzogiorno","african american|african-american", "middle easten"), c("white", "black", "middle-eastern"))) %>%
  mutate(Ethnicity = mgsub(Ethnicity, c("asian/white","white/hispanic"), c("multiracial","multiracial"), fixed=TRUE)) %>%
  mutate(Ethnicity = mgsub(Ethnicity, c(".*white.*", "black.*", ".*asian"), c("white", "black", "asian"))) %>%
  # Change vector classes from character class
  mutate(Age = as.numeric(Age)) %>%
  # Select
  select(conditionId, participantId, everything())
quesdata.demo
# Ethnicity Counts
quesdata.demo %>% group_by(Ethnicity) %>% count()
quesdata.demo %>% group_by(Ethnicity, conditionId) %>% count() %>% pivot_wider(Ethnicity, names_from = conditionId, values_from=n)
NA
# Gender Counts
quesdata.demo %>% group_by(Gender) %>% count()
quesdata.demo %>% group_by(Gender, conditionId) %>% count() %>% pivot_wider(Gender, names_from = conditionId, values_from=n)
NA
# Age
quesdata.demo %>% summarise(n=length(conditionId), minAge = min(Age), maxAge = max(Age), meanAge = mean(Age), sdAge = sd(Age))
quesdata.demo %>% group_by(conditionId) %>%
summarise(n=length(conditionId), minAge = min(Age), maxAge = max(Age), meanAge = mean(Age), sdAge = sd(Age)) %>% ungroup()
NA
# Location-history columns only
quesdata.loc <- quesdata.demo %>% select(participantId, starts_with("Loc"))
quesdata.loc
# Collect every unique Loc*_3 (place) answer across the six location slots.
# unlist() concatenates the columns in Loc1_3..Loc6_3 order, so unique()
# keeps the same first-occurrence order as the original repeated-rbind()
# version, without growing a data frame inside a chain of rbind() calls.
loc.code <- data.frame(
  Loc = unique(unname(unlist(quesdata.loc[paste0("Loc", 1:6, "_3")])))
)
loc.code
# Subset data to text format; drop the demographic columns handled above
quesdata.text <- quesdata %>% filter(dataFormat == "text") %>%
select(participantId, conditionId, guiseCombination, speakerOrder, everything()) %>%
select(-c(Age, Gender, Ethnicity, SpHDisorder, SpHDisorder_2_TEXT, Degree, LingExp, FirstLang,
Loc1_1:Loc6_5, expPurpose)) %>%
mutate_if(is.character, as.factor)
# Further drop question-text (Q*) columns, the Qualtrics score column (SC0),
# and the tone-check score merged in earlier
quesdata.text.sub <- quesdata.text %>%
select(-starts_with("Q"), -SC0, -tonesCorrect) %>%
droplevels()
quesdata.text.sub
IK2 (Implicit Knowledge Q2) refers to the question where subjects select all the words they think Canadians and Michiganders would pronounce differently (“Which of the following words, if any, do you think would be pronounced differently by someone from Canada, as opposed to someone from Michigan? Please select all that apply.”). The question includes 30 words, 5 of which are target /au/ words and 5 of which are target /ai/ words.
We want to go from the raw data, the selected words, to the number of target words (/au/ and /ai/ raising words) selected. The response format is words separated by commas in one cell. So, we need to separate the strings into separate words, check whether each target word occurred, then tabulate the scores. A simple search for word strings won’t work, because some words (e.g. out) are substrings of other words (e.g. about).
My solution was to first separate the single column by commas into multiple columns, one per word. To check whether the word was a response, I implement a count of 1 in a new column if a search function finds the target string in a row. This is done for each target word. Finally, I sum the count columns for /ai/ and /au/ separately to get a score out of 5.
# IK2: Can Word Selection question ("which words would Canadians pronounce differently?")
# Keep only the IK2 response, plus a working copy (Words) that gets split below
ik2 <- select(quesdata.text.sub, participantId, IK2.CanWords)
ik2 <- mutate(ik2, Words = IK2.CanWords)
# Split the comma-separated Words string into one word per column (Word_1, Word_2, ...).
# separate() needs the same number of output columns for every row; 30 is used as a
# safe upper bound (the question offers 30 words), with extra = "drop" discarding any
# overflow and NA filling columns beyond the words a subject actually selected.
# NOTE(review): an earlier comment claimed the observed maximum was 23 selections;
# the 30-column allocation safely covers that.
ik2 <- separate(ik2, "Words", paste("Word", 1:30, sep="_"), sep=",", extra="drop")
# Score each target word per subject: 1 if selected, 0 if not.
# Two matching strategies are used:
#   * grepl() substring search on the raw IK2.CanWords string — safe only for target
#     words that are not substrings of any other listed word (e.g. "about", "devout").
#   * exact cell match via apply()/%in% over the split Word_* columns — needed for
#     words like "out", which would otherwise false-positive inside "about"/"devout".
# NOTE(review): the exact match assumes the split cells carry no stray whitespace
# (i.e. the raw response is comma-separated with no spaces) — confirm against the data.
# NOTE(review): apply() coerces each row to character, so the 0/1 integer columns
# added by the first mutate are scanned too in the second; they can never equal a
# target word string, so the matches are unaffected.
# /au/ targets
ik2 <- mutate(ik2, IK2.au.out = as.integer(apply(ik2, 1, function(x) any(x %in% "out"))),
IK2.au.about = as.integer(grepl('about',IK2.CanWords)),
IK2.au.devout = as.integer(grepl('devout',IK2.CanWords)),
IK2.au.house = as.integer(grepl('house',IK2.CanWords)),
IK2.au.pouch = as.integer(grepl('pouch',IK2.CanWords)))
# /ai/ targets
ik2 <- mutate(ik2, IK2.ai.like = as.integer(apply(ik2, 1, function(x) any(x %in% "like"))),
IK2.ai.right = as.integer(apply(ik2, 1, function(x) any(x %in% "right"))),
IK2.ai.might = as.integer(apply(ik2, 1, function(x) any(x %in% "might"))),
IK2.ai.unite = as.integer(grepl('unite',IK2.CanWords)),
IK2.ai.ripe = as.integer(apply(ik2, 1, function(x) any(x %in% "ripe"))))
# Sum the per-word indicators into scores out of 5 for /au/ and /ai/
ik2 <- mutate(ik2, IK2.au = rowSums(select(ik2, IK2.au.out:IK2.au.pouch)),
IK2.ai = rowSums(select(ik2, IK2.ai.like:IK2.ai.ripe)))
# Two views of the IK2 scores:
# (a) per-word indicators plus the /au/ and /ai/ sums
ik2.values <- ik2 %>% select(participantId, IK2.au, IK2.ai, IK2.au.out:IK2.ai.ripe)
# (b) the /au/ and /ai/ sums only
ik2.sum <- ik2 %>% select(participantId, IK2.au:IK2.ai)
A few cases where participants do not respond to a question because the answer is ‘no’ result in an ‘NA’ entry. These ‘NA’s are adjusted “manually” to the correct value for the specific columns concerned.
# Subset questionnaire data to the numeric ("num") response rows.
# across()/where() replace the superseded mutate_at()/mutate_if()/select_if().
quesdata.num <- quesdata %>% filter(dataFormat == "num") %>%
select(participantId, conditionId, guiseCombination, speakerOrder, everything()) %>%
select(-c(Age, Gender, Ethnicity, SpHDisorder, SpHDisorder_2_TEXT, Degree, LingExp, FirstLang,
Loc1_1:Loc6_5, expPurpose)) %>%
rename(EQ.raws = SC0) %>%
mutate(across(participantId:speakerOrder, as.factor)) %>%
# Columns whose every entry is a digit string become numeric
mutate(across(where(~ all(grepl('^\\d+$', .x))), as.numeric))
## Further adjustments
quesdata.num.sub <- quesdata.num %>%
# Coerce remaining character columns to numeric; genuinely non-numeric columns
# become all-NA ("NAs introduced by coercion" warnings are expected here)
mutate(across(where(is.character), as.numeric)) %>%
# Drop columns that are entirely NA after coercion
select(where(~ !all(is.na(.x)))) %>%
# Remove question types / specific questions not analyzed numerically
select(-starts_with("Q"), -starts_with("Lang"), -starts_with("IK2"), -starts_with("ME1"), -ends_with("TEXT")) %>%
# Recode NAs that mean 'no'/'never' to the scale's lowest value
mutate(Travel.NE_Visits = coalesce(Travel.NE_Visits, 1), Travel.S_Visits = coalesce(Travel.S_Visits, 1),
Travel.W_Visits = coalesce(Travel.W_Visits, 1), Travel.Can_Visits = coalesce(Travel.Can_Visits, 1),
Travel.NE_Time = coalesce(Travel.NE_Time, 1), Travel.S_Time = coalesce(Travel.S_Time, 1),
Travel.W_Time = coalesce(Travel.W_Time, 1), Travel.Can_Time = coalesce(Travel.Can_Time, 1),
PE1.Relatives = coalesce(PE1.Relatives, 2), PE1.CloseFamFriends = coalesce(PE1.CloseFamFriends, 2),
EK3.CanAI.Diff = coalesce(EK3.CanAI.Diff, 2), EK4.CanAU.Diff = coalesce(EK4.CanAU.Diff, 2),
ME3.Sources.OtherMedia_1 = coalesce(ME3.Sources.OtherMedia_1, 1),
ME3.Sources.Other_1 = coalesce(ME3.Sources.Other_1, 1),
ME3.Sources.News_1 = coalesce(ME3.Sources.News_1, 1)) %>%
droplevels()
Problem with `mutate()` input `Lang1_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang1_1` is `.Primitive("as.double")(Lang1_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang1_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang1_2` is `.Primitive("as.double")(Lang1_2)`.NAs introduced by coercionProblem with `mutate()` input `Lang2_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang2_1` is `.Primitive("as.double")(Lang2_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang2_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang2_2` is `.Primitive("as.double")(Lang2_2)`.NAs introduced by coercionProblem with `mutate()` input `Lang3_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang3_1` is `.Primitive("as.double")(Lang3_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang4_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang4_1` is `.Primitive("as.double")(Lang4_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang5_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang5_1` is `.Primitive("as.double")(Lang5_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang6_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang6_1` is `.Primitive("as.double")(Lang6_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang7_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang7_1` is `.Primitive("as.double")(Lang7_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang7_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang7_2` is `.Primitive("as.double")(Lang7_2)`.NAs introduced by coercionProblem with `mutate()` input `LangSpeakFreq`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `LangSpeakFreq` is `.Primitive("as.double")(LangSpeakFreq)`.NAs introduced by coercionProblem with `mutate()` input `LangHearFreq`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `LangHearFreq` is `.Primitive("as.double")(LangHearFreq)`.NAs introduced by coercionProblem with `mutate()` input `SK1.WhereStandard`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `SK1.WhereStandard` is `.Primitive("as.double")(SK1.WhereStandard)`.NAs introduced by coercionProblem with `mutate()` input `IK2.CanWords`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `IK2.CanWords` is `.Primitive("as.double")(IK2.CanWords)`.NAs introduced by coercionProblem with `mutate()` input `EK1.CanSpeak.Diff`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `EK1.CanSpeak.Diff` is `.Primitive("as.double")(EK1.CanSpeak.Diff)`.NAs introduced by coercionProblem with `mutate()` input `EK2.CanPron.Diff`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `EK2.CanPron.Diff` is `.Primitive("as.double")(EK2.CanPron.Diff)`.NAs introduced by coercionProblem with `mutate()` input `EK3.CanAI.Diff_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `EK3.CanAI.Diff_1_TEXT` is `.Primitive("as.double")(EK3.CanAI.Diff_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `EK4.CanAU.Diff_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `EK4.CanAU.Diff_1_TEXT` is `.Primitive("as.double")(EK4.CanAU.Diff_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `PE1.Relatives_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `PE1.Relatives_1_TEXT` is `.Primitive("as.double")(PE1.Relatives_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `PE1.CloseFamFriends_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `PE1.CloseFamFriends_1_TEXT` is `.Primitive("as.double")(PE1.CloseFamFriends_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `ME1.TVShows`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `ME1.TVShows` is `.Primitive("as.double")(ME1.TVShows)`.NAs introduced by coercionProblem with `mutate()` input `ME3.Sources.OtherMedia_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `ME3.Sources.OtherMedia_1_TEXT` is `.Primitive("as.double")(ME3.Sources.OtherMedia_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `ME3.Sources.Other_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `ME3.Sources.Other_1_TEXT` is `.Primitive("as.double")(ME3.Sources.Other_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `dataFormat`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `dataFormat` is `.Primitive("as.double")(dataFormat)`.NAs introduced by coercion
quesdata.num.sub
# Select and merge only the relevant columns for the PCA analysis
# merge() joins on all shared columns (natural join)
quesdata.clean <- quesdata.demo %>%
select(participantId, conditionId, guiseCombination, speakerOrder, Age, Gender, Ethnicity) %>%
merge(ik2.values) %>%
merge(quesdata.num.sub) %>%
droplevels()
quesdata.clean
# Check pairwise correlations; substantial intercorrelation motivates using PCA
# to reduce dimensionality
quesdata.clean %>% with(cor.test(IK2.ai, IK2.au))
# Correlations between closely related questions
quesdata.clean %>% with(cor.test(Travel.Can_Visits, Travel.Can_Time))
quesdata.clean %>% with(cor.test(SE1.Fam.oot_1, SE2.Freq.Overall_1))
quesdata.clean %>% with(cor.test(SE2.Freq.Overall_1, SE2.Freq.Recent_1))
quesdata.clean %>% with(cor.test(SE2.Freq.Overall_1, SE2.Freq.Child_1))
quesdata.clean %>% with(cor.test(SE2.Freq.Recent_1, SE2.Freq.Child_1))
Packages required for this PCA: * PCA command from FactoMineR library (see index for more info) * paran command from paran library * Varimax command from GPArotation library (https://stats.stackexchange.com/questions/59213/how-to-compute-varimax-rotated-principal-components-in-r)
Packages for visualization of PCA * fviz_pca_ind and fviz_pca_biplot from factoextra
# (0) Data for the PCA — numerical predictor columns only
# "Medium" trimmed set: excludes general Canadian stereotype experience/knowledge,
# imitation, sources of CE exposure, and hockey items
quesdata.pca.med <- quesdata.clean %>%
select(IK2.au, IK2.ai,
SE1.Fam.oot_1, SE2.Freq.Recent_1:SE2.Freq.Overall_1,
PE2.CanSpeakFreq.Recent_1:PE2.CanSpeakFreq.Overall_1,
ME4.CanHearFreq.Recent_1:ME4.CanHearFreq.Overall_1)
quesdata.pca.med
NA
## (1) Horn's parallel analysis with `paran`
# Standard way to decide how many components to retain; also draws a scree plot
# with the random-data (adjusted) and unadjusted eigenvalue lines
paran(quesdata.pca.med,
seed = 0, graph = TRUE, color = TRUE, legend = TRUE,
col = c("black", "red", "blue"), lty = c(1, 2, 3), lwd = 1,
file = "", grdevice = "png", width = 640, height = 640)
Using eigendecomposition of correlation matrix.
Computing: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
Results of Horn's Parallel Analysis for component retention
360 iterations, using the mean estimate
--------------------------------------------------
Component Adjusted Unadjusted Estimated
Eigenvalue Eigenvalue Bias
--------------------------------------------------
1 4.923608 5.569209 0.645600
2 1.014315 1.477026 0.462710
--------------------------------------------------
Adjusted eigenvalues > 1 indicate dimensions to retain.
(2 components retained)
## (2) Run the PCA with `FactoMineR`
# ncp (number of components kept) = 2, per the parallel analysis above.
# Useful pieces of the returned object: $var (variables), $ind (individuals),
# $call (summary stats).
lbqPCA <- PCA(quesdata.pca.med, scale.unit = TRUE, ncp = 2, graph = FALSE)
## Relevant raw PCA output
# Eigenvalues and percent variance accounted for (outer parens force printing)
(eigenvalues <- lbqPCA$eig)
eigenvalue percentage of variance cumulative percentage of variance
comp 1 5.56920934 46.4100778 46.41008
comp 2 1.47702633 12.3085527 58.71863
comp 3 1.17513102 9.7927585 68.51139
comp 4 1.08602655 9.0502212 77.56161
comp 5 0.77331255 6.4442713 84.00588
comp 6 0.64543605 5.3786337 89.38452
comp 7 0.52683655 4.3903046 93.77482
comp 8 0.29948597 2.4957164 96.27054
comp 9 0.18393724 1.5328103 97.80335
comp 10 0.13160788 1.0967324 98.90008
comp 11 0.09079792 0.7566494 99.65673
comp 12 0.04119260 0.3432717 100.00000
# Variable coordinates on the retained dimensions.
# NOTE(review): lbqPCA$var$coord holds FactoMineR's variable coordinates (the
# variables' correlations with the components), not raw unit-norm eigenvectors —
# the name "eigenvectors" kept here follows the original analysis but may be a
# misnomer; confirm against the FactoMineR documentation.
(eigenvectors <- lbqPCA$var$coord)
Dim.1 Dim.2
IK2.au 0.3447922 0.42338649
IK2.ai -0.1448258 -0.29672009
SE1.Fam.oot_1 0.5141354 0.29038508
SE2.Freq.Recent_1 0.7477153 0.29500278
SE2.Freq.Child_1 0.6827982 0.47880752
SE2.Freq.Overall_1 0.7706314 0.41458997
PE2.CanSpeakFreq.Recent_1 0.7657419 -0.49904588
PE2.CanSpeakFreq.Child_1 0.6253158 -0.05611207
PE2.CanSpeakFreq.Overall_1 0.8219899 -0.44469439
ME4.CanHearFreq.Recent_1 0.7866714 -0.32268452
ME4.CanHearFreq.Child_1 0.7453443 0.08655559
ME4.CanHearFreq.Overall_1 0.8475941 -0.27508210
# Rescale the variable coordinates column-by-column by 1/sqrt(eigenvalue).
# NOTE(review): dividing coordinates by sqrt(eig) yields unit-scaled vectors
# (used as "raw loadings" by the rotation step below); the conventional relation
# "loadings = eigenvectors * sqrt(eigenvalue)" runs the other way — confirm which
# scaling the downstream Varimax/score computation expects.
rawLoadings <- sweep(lbqPCA$var$coord,2,sqrt(lbqPCA$eig[1:ncol(lbqPCA$var$coord),1]),FUN="/")
rawLoadings
Dim.1 Dim.2
IK2.au 0.14610351 0.34837171
IK2.ai -0.06136901 -0.24414781
SE1.Fam.oot_1 0.21786163 0.23893522
SE2.Freq.Recent_1 0.31683962 0.24273477
SE2.Freq.Child_1 0.28933140 0.39397335
SE2.Freq.Overall_1 0.32655016 0.34113374
PE2.CanSpeakFreq.Recent_1 0.32447830 -0.41062592
PE2.CanSpeakFreq.Child_1 0.26497363 -0.04617024
PE2.CanSpeakFreq.Overall_1 0.34831302 -0.36590431
ME4.CanHearFreq.Recent_1 0.33334703 -0.26551191
ME4.CanHearFreq.Child_1 0.31583494 0.07121984
ME4.CanHearFreq.Overall_1 0.35916263 -0.22634360
# Individual (per-subject) coordinates on the retained components
# (also called factor scores / principal coordinates)
rawScores <- lbqPCA$ind$coord
## (3) Varimax rotation of the retained loadings with `GPArotation`
# Rotation is applied to the retained components' loadings only (not all
# components, and not the eigenvectors); it is done purely to ease
# interpretation by maximizing the factor loadings
(rotLoadings <- Varimax(rawLoadings)$loadings)
Dim.1 Dim.2
IK2.au -0.130050509 0.35467725
IK2.ai 0.121199839 -0.22064656
SE1.Fam.oot_1 -0.002994765 0.32333383
SE2.Freq.Recent_1 0.066937539 0.39348047
SE2.Freq.Child_1 -0.056147458 0.48556681
SE2.Freq.Overall_1 0.007083407 0.47218329
PE2.CanSpeakFreq.Recent_1 0.517200961 -0.08001862
PE2.CanSpeakFreq.Child_1 0.225560983 0.14650924
PE2.CanSpeakFreq.Overall_1 0.504227143 -0.03103091
ME4.CanHearFreq.Recent_1 0.424936553 0.03233799
ME4.CanHearFreq.Child_1 0.182931745 0.26713284
ME4.CanHearFreq.Overall_1 0.417193540 0.07860512
# Recover the rotation matrix from the loadings.
# Since rotLoadings = rawLoadings %*% rotMatrix, premultiplying by
# t(rawLoadings) recovers rotMatrix — valid when rawLoadings' columns are
# orthonormal, so that t(L) %*% L = I and the transpose acts as the inverse.
# NOTE(review): Varimax(rawLoadings)$rotmat returned NULL in this setup, hence
# this manual reconstruction; the result matches Varimax(rawLoadings)'s matrix.
(rotMatrixL <- t(rawLoadings) %*% rotLoadings)
Dim.1 Dim.2
Dim.1 0.7326701 0.6805839
Dim.2 -0.6805839 0.7326701
# Calculate rotated factor scores:
#   1. z-score the raw component scores (base scale() centers/scales each column)
#   2. matrix-multiply the z-scores by the rotation matrix
# Result: a matrix with one row per subject and one column per rotated component
zScores <- scale(rawScores)
rotScores <- zScores %*% rotMatrixL
## (4) Visualize the raw (unrotated) PCA with `factoextra`
# Individual factor scores
fviz_pca_ind(lbqPCA, repel = TRUE, col.ind = "#00AFBB")
# Biplot: individual scores plus variable vectors
fviz_pca_biplot(lbqPCA, col.ind = "#00AFBB", col.var = "black", label = "all", ggtheme = theme_minimal())
## (5) Manual plots of the rotated scores with `ggplot`
## Build data frames from the rotated loading and score matrices
# Loadings: one row per variable, plus the variable name as a factor
rotLoadingsData <- as.data.frame(rotLoadings)
rotLoadingsData$variable <- factor(rownames(rotLoadings))
# Scores: one row per subject, plus a sequential subject id
rotScoreData <- as.data.frame(rotScores)
rotScoreData$subject <- seq_len(nrow(rotScoreData))
## Base plots
# Loading plot: arrows from the origin (loadings scaled x4), variable names in red
loadingplot <- rotLoadingsData %>%
ggplot(aes(x = Dim.1, y = Dim.2)) +
geom_segment(aes(x = 0, y = 0, xend = Dim.1 * 4, yend = Dim.2 * 4),
arrow = arrow(), size = 0.5, color = "black") +
geom_text(aes(x = Dim.1 * 4, y = Dim.2 * 4, label = variable),
color = "red", check_overlap = TRUE) +
scale_x_continuous(lim = c(-2.75, 2.75), breaks = seq(-3, 3, 1)) +
scale_y_continuous(lim = c(-3.5, 3.5), breaks = seq(-3, 3, 1)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
labs(title="Variables - PCA", x="Dim 1 (36.4%)", y="Dim 2 (17.7%)") +
gg_theme()
loadingplot
# Scatter plot of individual (per-subject) rotated factor scores
# Fix: assignment via `<-` rather than `=` (R style; avoids confusion with
# named-argument syntax)
dimplot <- ggplot(rotScoreData, aes(x=Dim.1, y=Dim.2))+
geom_point(na.rm=TRUE, color="#00AFBB") +
geom_text(aes(label=subject),hjust=1.5,vjust=1.5, color="#00AFBB", check_overlap=T)+
scale_x_continuous(lim=c(-2.75, 2.75),breaks=seq(-3,3,1)) +
scale_y_continuous(lim=c(-3.5, 3.5),breaks=seq(-3,3,1)) +
geom_hline(yintercept=0, linetype="dashed") +
geom_vline(xintercept=0, linetype="dashed") +
labs(title="Individuals - PCA", x="Dim 1 (36.4%)", y="Dim 2 (17.7%)") +
gg_theme()
dimplot
## Merge loading and score plot = biplot
# Layers draw in order (z-order matters): subject points and labels first, then
# the loading arrows and their labels on top, then the dashed reference lines.
# Loadings are scaled x4 for the arrows and x4.5 for the labels so the text sits
# just beyond each arrow head.
ggplot(rotScoreData, aes(x=Dim.1, y=Dim.2))+
geom_point(na.rm=TRUE, color="#00AFBB") +
geom_text(aes(label=subject),hjust=1.5,vjust=1.5, color="#00AFBB", check_overlap=T)+
# Overlay the loading plot (arrows + red variable labels)
geom_segment(data=rotLoadingsData, mapping=aes(x=0, y=0, xend=Dim.1*4, yend=Dim.2*4), arrow=arrow(), size=0.5, color="black") +
geom_text(data=rotLoadingsData, aes(x=Dim.1*4.5, y=Dim.2*4.5, label=variable), color="red",check_overlap=T, nudge_y = 0)+
scale_x_continuous(lim=c(-2.75, 2.75),breaks=seq(-3,3,1)) +
scale_y_continuous(lim=c(-3.5, 3.5),breaks=seq(-3,3,1)) +
geom_hline(yintercept=0, linetype="dashed") +
geom_vline(xintercept=0, linetype="dashed") +
labs(title="Biplot - PCA", x="Dim 1 (36.4%)", y="Dim 2 (17.7%)") +
gg_theme()
## Interpret the PCs (dimensions) from their rotated factor loadings
rotLoadings.df <- as.data.frame(rotLoadings) %>%
rownames_to_column("Variables") %>%
rename(PC1 = Dim.1, PC2 = Dim.2)
rotLoadings.df
# Variables contributing to PC1 (|loading| > 0.2)
rotLoadings.df %>% filter(abs(PC1) > 0.2)
# Variables contributing to PC2
rotLoadings.df %>% filter(abs(PC2) > 0.2)
# Variables contributing to both PCs
rotLoadings.df %>% filter(abs(PC1) > 0.2 & abs(PC2) > 0.2)
# Variables contributing to neither PC
rotLoadings.df %>% filter(abs(PC1) < 0.2 & abs(PC2) < 0.2)
# For Merging: Convert rotated factor score matrix to data frame; add participantId (assuming order of input dataframe)
There were 13 warnings (use warnings() to see them)
indPCAdata <- rotScoreData %>%
mutate(participantId = quesdata.clean$participantId) %>%
# Give the rotated components interpretable names
rename(CEscore = Dim.1, SAscore = Dim.2)
# Merge participant data with the rotated PC scores; derive z-scored MSscore and
# EQscore. across()/where() replace the superseded mutate_at()/mutate_if().
# Only the main relevant score columns are moved to the front.
quesdata.final <- quesdata.clean %>%
merge(indPCAdata) %>%
mutate(MSscore = scale(SK2.MichvStand),
EQscore = scale(EQ.raws),
Age = as.numeric(Age)) %>%
mutate(across(where(is.character), as.factor)) %>%
select(participantId, conditionId, guiseCombination, speakerOrder, Age, Gender, Ethnicity, CEscore, SAscore, MSscore, EQscore, EQ.raws, everything())
summary(quesdata.final)
participantId conditionId guiseCombination speakerOrder Age Gender Ethnicity CEscore
56d9f54b610fc0000bb76e8d: 1 condA:32 baseline:31 S3-S9:90 Min. :18.00 f :31 asian : 6 Min. :-1.6202
56e72067c89073000be77fdf: 1 condC:27 match :32 1st Qu.:22.00 m :56 black : 9 1st Qu.:-0.7392
59c5689f46f72100019066a7: 1 condE:31 mismatch:27 Median :27.00 nb: 3 hispanic : 2 Median :-0.2443
5a412f0b99311d0001df431e: 1 Mean :29.78 middle-eastern: 2 Mean : 0.0000
5b14791ebd9c31000156e8ab: 1 3rd Qu.:33.00 multiracial : 2 3rd Qu.: 0.6814
5bef64b373e44f0001092811: 1 Max. :68.00 white :69 Max. : 2.6322
(Other) :84
SAscore MSscore.V1 EQscore.V1 EQ.raws IK2.au IK2.ai IK2.au.out
Min. :-2.73135 Min. :-2.9560813 Min. :-2.2862673 Min. :12.00 Min. :0.000 Min. :0.0000 Min. :0.0000
1st Qu.:-0.53157 1st Qu.:-0.3979340 1st Qu.:-0.6133888 1st Qu.:33.00 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.:0.0000
Median : 0.09743 Median : 0.4547817 Median :-0.1752539 Median :38.50 Median :4.000 Median :0.0000 Median :1.0000
Mean : 0.00000 Mean : 0.0000000 Mean : 0.0000000 Mean :40.70 Mean :3.344 Mean :0.9333 Mean :0.6556
3rd Qu.: 0.71486 3rd Qu.: 0.4547817 3rd Qu.: 0.7209310 3rd Qu.:49.75 3rd Qu.:5.000 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. : 1.79057 Max. : 1.3074975 Max. : 2.3340639 Max. :70.00 Max. :5.000 Max. :5.0000 Max. :1.0000
IK2.au.about IK2.au.devout IK2.au.house IK2.au.pouch IK2.ai.like IK2.ai.right IK2.ai.might IK2.ai.unite
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :1.0000 Median :1.0000 Median :1.0000 Median :1.0000 Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.7778 Mean :0.7333 Mean :0.5889 Mean :0.5889 Mean :0.1444 Mean :0.3222 Mean :0.1889 Mean :0.1444
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
IK2.ai.ripe Travel.NE_Visits Travel.S_Visits Travel.W_Visits Travel.Can_Visits Travel.NE_Time Travel.S_Time Travel.W_Time
Min. :0.0000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:2.0 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
Median :0.0000 Median :2.000 Median :2.0 Median :1.500 Median :2.000 Median :1.000 Median :1.000 Median :1.000
Mean :0.1333 Mean :1.811 Mean :2.3 Mean :1.678 Mean :1.911 Mean :1.356 Mean :1.556 Mean :1.222
3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:3.0 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.000
Max. :1.0000 Max. :4.000 Max. :4.0 Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
Travel.Can_Time SK2.MichvStand SK3.CanvStand IK1.DecideCanada EK1.CanSpeak EK2.CanPron EK3.CanAI EK3.CanAI.Diff
Min. :1.000 Min. :2.000 Min. :2.000 Min. :2.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:1.000 1st Qu.:5.000 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
Median :1.000 Median :6.000 Median :4.000 Median :3.000 Median :1.000 Median :1.000 Median :2.000 Median :2.000
Mean :1.167 Mean :5.467 Mean :4.322 Mean :3.278 Mean :1.067 Mean :1.067 Mean :1.689 Mean :1.767
3rd Qu.:1.000 3rd Qu.:6.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000
Max. :4.000 Max. :7.000 Max. :7.000 Max. :5.000 Max. :2.000 Max. :2.000 Max. :2.000 Max. :2.000
EK4.CanAU EK4.CanAU.Diff SE1.Fam.eh_1 SE1.Fam.oot_1 SE1.Fam.sorry_1 SE1.Fam.washroom_1 SE1.Fam.nasal_1 SE2.Freq.Recent_1
Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.0
1st Qu.:1.000 1st Qu.:1.000 1st Qu.:6.000 1st Qu.:5.000 1st Qu.:2.250 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.0
Median :1.000 Median :1.000 Median :7.000 Median :7.000 Median :6.000 Median :3.000 Median :3.000 Median :4.0
Mean :1.089 Mean :1.233 Mean :6.089 Mean :5.889 Mean :4.822 Mean :3.356 Mean :3.156 Mean :4.1
3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:7.000 3rd Qu.:7.000 3rd Qu.:7.000 3rd Qu.:5.000 3rd Qu.:4.750 3rd Qu.:6.0
Max. :2.000 Max. :2.000 Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.0
SE2.Freq.Child_1 SE2.Freq.Overall_1 SE3.Accuracy SE4.CanHearImitate SE4.auHearImitate SE5.CanImitate SE5.auImitate PE1.KnowCan
Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000
1st Qu.:2.000 1st Qu.:3.000 1st Qu.:5.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:1.000
Median :4.000 Median :4.000 Median :5.000 Median :3.000 Median :6.000 Median :2.000 Median :3.0 Median :1.000
Mean :3.678 Mean :4.278 Mean :4.944 Mean :3.233 Mean :4.833 Mean :2.244 Mean :3.5 Mean :1.378
3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:6.000 3rd Qu.:4.000 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.:6.0 3rd Qu.:2.000
Max. :7.000 Max. :7.000 Max. :7.000 Max. :6.000 Max. :7.000 Max. :6.000 Max. :7.0 Max. :2.000
PE1.Relatives PE1.CloseFamFriends PE2.CanSpeakFreq.Recent_1 PE2.CanSpeakFreq.Child_1 PE2.CanSpeakFreq.Overall_1 ME2.HockeyFreq
Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.0
1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.0
Median :2.000 Median :2.000 Median :3.000 Median :2.000 Median :3.000 Median :2.0
Mean :1.933 Mean :1.689 Mean :3.233 Mean :2.456 Mean :3.111 Mean :2.5
3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.0
Max. :2.000 Max. :2.000 Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.0
ME3.Sources.People_1 ME3.Sources.TV_1 ME3.Sources.News_1 ME3.Sources.Hockey_1 ME3.Sources.Online_1 ME3.Sources.OtherMedia_1
Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:1.00 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
Median :2.00 Median :3.000 Median :2.000 Median :2.000 Median :3.000 Median :1.000
Mean :2.50 Mean :2.889 Mean :2.256 Mean :2.444 Mean :3.067 Mean :1.122
3rd Qu.:3.75 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:3.750 3rd Qu.:4.000 3rd Qu.:1.000
Max. :7.00 Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000 Max. :5.000
ME3.Sources.Other_1 ME4.CanHearFreq.Recent_1 ME4.CanHearFreq.Child_1 ME4.CanHearFreq.Overall_1 tonesCorrect subject
Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :0.000 Min. : 1.00
1st Qu.:1.000 1st Qu.:2.250 1st Qu.:2.0 1st Qu.:2.000 1st Qu.:5.000 1st Qu.:23.25
Median :1.000 Median :3.000 Median :2.5 Median :3.000 Median :6.000 Median :45.50
Mean :1.144 Mean :3.622 Mean :2.8 Mean :3.322 Mean :5.522 Mean :45.50
3rd Qu.:1.000 3rd Qu.:5.000 3rd Qu.:4.0 3rd Qu.:4.000 3rd Qu.:6.000 3rd Qu.:67.75
Max. :5.000 Max. :7.000 Max. :7.0 Max. :7.000 Max. :6.000 Max. :90.00
quesdata.final
# Persist the cleaned questionnaire data for downstream analyses
write.csv(quesdata.final, file = 'data/axb_1b_lbq_data.csv', row.names = FALSE)
# Summary statistics (mean/sd/min/max) of the derived scores, overall and per
# guise group. Factored into a helper to avoid repeating the same pipeline four
# times; across(where(...)) replaces the superseded summarise_if(), and
# pivot_wider's id_cols is passed explicitly rather than positionally.
# (A stray interactive `?summarise_if` help call was removed.)
#   group_label  — value for the leading `group` column of the output
#   filter_guise — optional guiseCombination level to restrict to (NULL = all)
score_summary <- function(df, group_label, filter_guise = NULL) {
  if (!is.null(filter_guise)) {
    df <- filter(df, guiseCombination == filter_guise)
  }
  df %>%
    select(participantId, conditionId, guiseCombination, Age, Gender, Ethnicity,
           CEscore, SAscore, MSscore, EQscore, EQ.raws) %>%
    summarise(across(where(is.numeric), list(mean = mean, sd = sd, min = min, max = max))) %>%
    pivot_longer(everything(), names_to = c("Measure", "Stat"), names_sep = "_", values_to = "value") %>%
    pivot_wider(id_cols = Measure, names_from = "Stat", values_from = "value") %>%
    mutate(mean = round(mean, digits = 6)) %>%
    mutate(group = group_label) %>%
    select(group, everything())
}
# Total
score_summary(quesdata.final, "Total")
# Baseline
score_summary(quesdata.final, "baseline", filter_guise = "baseline")
# Match
score_summary(quesdata.final, "match", filter_guise = "match")
# Mismatch
score_summary(quesdata.final, "mismatch", filter_guise = "mismatch")
# Mean self-exposure measures by guise group
# (means computed with across(), then renamed to the original output columns)
quesdata.final %>%
group_by(guiseCombination) %>%
summarize(across(c(IK2.au, IK2.ai, SE1.Fam.oot_1, SE2.Freq.Overall_1,
SE2.Freq.Child_1, SE2.Freq.Recent_1, SE3.Accuracy), mean)) %>%
rename(meanIK2.au = IK2.au, meanIK2.ai = IK2.ai, meanOotFam = SE1.Fam.oot_1,
meanSEFreq = SE2.Freq.Overall_1, meanSEFreq.C = SE2.Freq.Child_1,
meanSEFreq.R = SE2.Freq.Recent_1, meanSEAcc = SE3.Accuracy)
# Mean Canadian-English hearing/speaking frequency measures by guise group
quesdata.final %>%
group_by(guiseCombination) %>%
summarize(across(c(ME4.CanHearFreq.Overall_1, ME4.CanHearFreq.Recent_1,
ME4.CanHearFreq.Child_1, PE2.CanSpeakFreq.Overall_1,
PE2.CanSpeakFreq.Recent_1, PE2.CanSpeakFreq.Child_1), mean)) %>%
rename(CanHearFreq = ME4.CanHearFreq.Overall_1, CanHearFreq.R = ME4.CanHearFreq.Recent_1,
CanHearFreq.C = ME4.CanHearFreq.Child_1, CanSpeakFreq = PE2.CanSpeakFreq.Overall_1,
CanSpeakFreq.R = PE2.CanSpeakFreq.Recent_1, CanSpeakFreq.C = PE2.CanSpeakFreq.Child_1)
# Count yes/no answers to the explicit-knowledge questions by guise group.
# Factored into a helper: the four original pipelines differed only in the
# question column, the output label, and whether a yes-proportion was reported.
#   var   — column name (string); 1 is recoded to "yes", anything else to "no"
#   label — value for the leading `question` column
#   prop  — add prop.yes = yes / (yes + no)? (FALSE for EK1/EK2, matching the
#           original output exactly)
count_yes_no <- function(df, var, label, prop = FALSE) {
  out <- df %>%
    group_by(guiseCombination) %>%
    mutate(resp = ifelse(.data[[var]] == 1, "yes", "no")) %>%
    count(resp) %>%
    pivot_wider(names_from = resp, values_from = n)
  if (prop) {
    out <- mutate(out, prop.yes = yes/(yes+no))
  }
  out %>% mutate(question = label) %>% relocate(question)
}
count_yes_no(quesdata.final, "EK1.CanSpeak", "SpeakDiff")
count_yes_no(quesdata.final, "EK2.CanPron", "PronounceDiff")
count_yes_no(quesdata.final, "EK3.CanAI", "aiDiff", prop = TRUE)
count_yes_no(quesdata.final, "EK4.CanAU", "auDiff", prop = TRUE)
# Overall totals (all participants): proportion answering "yes" to the /ai/ and
# /au/ difference questions, stacked into one table
rbind(
quesdata.final %>% mutate(EK3.CanAI = ifelse(EK3.CanAI==1, "yes", "no")) %>%
count(EK3.CanAI) %>%
pivot_wider(names_from = EK3.CanAI, values_from = n) %>%
mutate(prop.yes = yes/(yes+no)) %>%
mutate(question="aiDiff") %>% relocate(question),
quesdata.final %>% mutate(EK4.CanAU = ifelse(EK4.CanAU==1, "yes", "no")) %>%
count(EK4.CanAU) %>%
pivot_wider(names_from = EK4.CanAU, values_from = n) %>%
mutate(prop.yes = yes/(yes+no)) %>%
mutate(question="auDiff") %>% relocate(question)
)
NA
# EQ score by gender
quesdata.final %>%
ggplot(aes(x = Gender, y = EQscore)) +
geom_boxplot(aes(linetype = Gender), fill = "grey", alpha = 0.3, na.rm = TRUE) +
gg_theme()
# EQ score by gender within each guise group
quesdata.final %>%
ggplot(aes(x = Gender, y = EQscore)) +
geom_boxplot(aes(fill = guiseCombination, color = guiseCombination, linetype = Gender),
alpha = 0.3, na.rm = TRUE) +
facet_grid(~guiseCombination) +
scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
gg_theme()
# EQ score by guise group
quesdata.final %>%
ggplot(aes(x = guiseCombination, y = EQscore)) +
geom_boxplot(aes(fill = guiseCombination, color = guiseCombination), alpha = 0.3, na.rm = TRUE) +
scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
gg_theme()
# Density of SA score distributions by guise group (levels relabelled for display)
quesdata.final %>%
mutate(guiseCombination = fct_recode(guiseCombination,
"No-Guise" = "baseline",
"Guise-Match" = "match",
"Guise-Mismatch" = "mismatch")) %>%
ggplot(aes(x = SAscore, fill = guiseCombination, color = guiseCombination)) +
geom_density(alpha = 0.3) +
labs(title="", x="SA Score", y="Count") +
gg_theme()
# SA score by gender
quesdata.final %>%
ggplot(aes(x = Gender, y = SAscore)) +
geom_boxplot(aes(linetype = Gender), fill = "grey", alpha = 0.3, na.rm = TRUE) +
gg_theme()
# SA score by gender within each guise group
quesdata.final %>%
ggplot(aes(x = Gender, y = SAscore)) +
geom_boxplot(aes(fill = guiseCombination, color = guiseCombination, linetype = Gender),
alpha = 0.3, na.rm = TRUE) +
facet_grid(~guiseCombination) +
scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
gg_theme()
# SA score by guise group
quesdata.final %>%
ggplot(aes(x = guiseCombination, y = SAscore)) +
geom_boxplot(aes(fill = guiseCombination, color = guiseCombination), alpha = 0.3, na.rm = TRUE) +
scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
gg_theme()
# Density Plot of score distributions
There were 16 warnings (use warnings() to see them)
# Density of CE score distributions by guise group (levels relabelled for display)
quesdata.final %>%
mutate(guiseCombination = fct_recode(guiseCombination,
"No-Guise" = "baseline",
"Guise-Match" = "match",
"Guise-Mismatch" = "mismatch")) %>%
ggplot(aes(x = CEscore, fill = guiseCombination, color = guiseCombination)) +
geom_density(alpha = 0.3) +
labs(title="", x="CE Score", y="Count") +
gg_theme()
# CE score by gender
quesdata.final %>%
ggplot(aes(x = Gender, y = CEscore)) +
geom_boxplot(aes(linetype = Gender), fill = "grey", alpha = 0.3, na.rm = TRUE) +
gg_theme()
# CE score by gender within each guise group
quesdata.final %>%
ggplot(aes(x = Gender, y = CEscore)) +
geom_boxplot(aes(fill = guiseCombination, color = guiseCombination, linetype = Gender),
alpha = 0.3, na.rm = TRUE) +
facet_grid(~guiseCombination) +
scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
gg_theme()
# CE score by guise group
quesdata.final %>%
ggplot(aes(x = guiseCombination, y = CEscore)) +
geom_boxplot(aes(fill = guiseCombination, color = guiseCombination), alpha = 0.3, na.rm = TRUE) +
scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
gg_theme()
# Joint distribution of CE (PC1) and SA (PC2) scores by participant group
CSCE.biplot <- quesdata.final %>%
mutate(guiseCombination = fct_recode(guiseCombination,
"No-Guise" = "baseline",
"Guise-Match" = "match",
"Guise-Mismatch" = "mismatch")) %>%
ggplot(aes(y = CEscore, x = SAscore, color = guiseCombination, shape = guiseCombination)) +
geom_point(na.rm = TRUE, size = 3, alpha = 0.7) +
scale_x_continuous(lim = c(-2.5, 2.5), breaks = seq(-3, 3, 1)) +
scale_y_continuous(lim = c(-3.5, 3.5), breaks = seq(-3, 3, 1)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
labs(y="CE Scores (PC1)", x="SA Scores (PC2)",
color="Participant Group", shape="Participant Group") +
scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
gg_theme()
CSCE.biplot
#ggsave(path="plots", filename="CS-CE_distribution_plot.png", CSCE.biplot, width=16, height=8, units = "in" , dpi=72)
# Scatter plots with linear fits between pairs of derived scores, colored by
# gender. Factored into a helper: the three original plots differed only in the
# x/y columns and axis labels.
# NOTE(review): all three plots originally carried the title
# "Correlations: By Speaker" — possibly a copy-paste leftover; preserved here so
# the output is unchanged.
#   xvar, yvar — column names (strings) to plot
#   xlab, ylab — axis labels
plot_score_pair <- function(df, xvar, yvar, xlab, ylab) {
  df %>%
    ggplot(aes(x = .data[[xvar]], y = .data[[yvar]])) +
    geom_point(stat="identity", aes(colour = factor(Gender)), cex=2) +
    geom_smooth(method="lm") +
    labs(y = ylab, x = xlab, color="Gender",
         title="Correlations: By Speaker") +
    scale_color_manual(values=ghibli_palette("PonyoLight")[c(4,3,2)])+
    gg_theme()
}
plot_score_pair(quesdata.final, "SAscore", "EQscore", "SA Score", "EQ Score")
plot_score_pair(quesdata.final, "SAscore", "CEscore", "SA Score", "CE Score")
plot_score_pair(quesdata.final, "CEscore", "EQscore", "CE Score", "EQ Score")